Pilot_Analysis

General analysis plan

Our analysis will evaluate the pilot outcomes for the Perceived Social Categorization Study. Participants rated the same set of 85 photos for perceived “Jewishness” and “Arabness.” Our objective is to pinpoint images where the average ratings across these two dimensions do not significantly diverge. This nuanced approach enables us to select images that best represent a balanced perception, laying a robust foundation for our main study and ensuring the integrity and relevance of our visual stimuli.

Data importation and initial preparation

Helper functions

standardize column names

Utilize the provided helper function colnames_to_underscores to standardize column names by replacing spaces and special characters with underscores.

Code
#' Replace spaces in column names with underscores.
#'
#' Gorilla exports use names like "Participant Private ID"; underscored names
#' can be referenced without backticks.
#'
#' @param data A data frame (or anything with names()).
#' @return The same object with spaces in its names replaced by "_".
colnames_to_underscores <- function(data = NULL) {
  dat <- data
  # FIX: use base gsub() with fixed = TRUE instead of stringr — the pattern is
  # a literal space, so no regex (or extra dependency) is needed. Also fixes
  # the misindented closing brace of the original.
  names(dat) <- gsub(" ", "_", names(dat), fixed = TRUE)
  return(dat)
}

get_summary_stats_function

Code
#' Thin wrapper around base summary() so summary tables are produced the same
#' way throughout the report.
#'
#' @param demo_wide_clean A data frame (any data frame, despite the name).
#' @return The summary() table for the input.
get_summary_stats <- function(demo_wide_clean) {
  summary(demo_wide_clean)
}

Data Importation and Initial Preparation:

Import data sets related to the main task, attention checks, and demographics.

Code
# Import raw exports: the main categorization task (two condition orders),
# the attention checks, and the demographics questionnaire.
# FIX: standardized show_col_types = F to FALSE (T/F are reassignable
# variables, TRUE/FALSE are reserved words) and normalized spacing.
data_categorization_jwish_first <- read_csv("../Data/data_exp_143127-v17_task-jwishfirstrealdata.csv", show_col_types = FALSE)
data_categorization_Arab_first <- read_csv("../Data/data_exp_143127-v17_task-4qo7Arabfirstrealdata.csv", show_col_types = FALSE)

att_check <- read_csv("../Data/data_exp_143127-v17_task-sfst_ATTcheck.csv", show_col_types = FALSE)

data_demo <- read_csv("../Data/data_exp_143127-v17_questionnaire-jj6n_demo_all_long_for.csv", show_col_types = FALSE)

Attention Check and Participant Filtering:

Data arrangement (binding — will not be necessary in the actual analysis)

Code
# Keep only the final slider-value rows of the attention-check export and
# reduce the table to participant ID + their response.
att_check <- att_check |>
  colnames_to_underscores() |>
  # "endValue" is a literal substring of the zone type, so base grepl()
  # matches exactly what str_detect() matched here.
  dplyr::filter(grepl("endValue", Zone_Type)) |>
  dplyr::select(Participant_Private_ID, Response) |>
  dplyr::mutate(Participant_Private_ID = factor(Participant_Private_ID))

Identify and exclude participants who failed the attention checks

Identifying who failed the attention checks (answer > 5)

Code
# Participants whose attention-check answer exceeded 5 are flagged for
# exclusion from the main analysis.
failed_IDs <- dplyr::filter(att_check, Response > 5) |>
  dplyr::select(Participant_Private_ID)

Data cleaning and Outlier Detection:

Cleaning data before filtering outliers

  • Merge task-related data from different data- sets and filter out participants who failed the attention checks.
  • Organizing the primary data for participants, including labeling and introducing a column to denote the order of conditions.
Code
# Combine both condition-order datasets, drop participants who failed the
# attention check, and keep only real slider responses from the two task
# screens. The Task_Name column is recoded to a short condition-order label
# and renamed order_of_conditions.
data_participants <- rbind(data_categorization_Arab_first, data_categorization_jwish_first) |>
  colnames_to_underscores() |>
  dplyr::filter(!(Participant_Private_ID %in% failed_IDs$Participant_Private_ID)) |>
  dplyr::filter(display %in% c("task_Jewish", "task_Arab")) |> # removing instructions screens
  dplyr::filter(Zone_Type == "response_slider_endValue") |> # only subjects answers
  select(Participant_Private_ID, Response, image, Reaction_Time, display, Task_Name) |>
  # FIX: recode Task_Name BEFORE any factor conversion. The original converted
  # it to factor first, only for case_when() to coerce it straight back to
  # character; the final column is character either way, so behavior is kept.
  mutate(Task_Name = case_when(
    Task_Name == "Group_categorization_JewishFirst_pilot2" ~ "JewishFirst",
    Task_Name == "Group_categorization_ArabFirst_pilot2" ~ "ArabFirst",
    TRUE ~ as.character(Task_Name)
  )) |>
  mutate(Participant_Private_ID = factor(Participant_Private_ID),
         image = factor(image),
         display = factor(display)) |>
  rename(order_of_conditions = Task_Name)

# Number of participants remaining after exclusions.
num_participants <- n_distinct(data_participants$Participant_Private_ID)

visualizing responses by display

Density plots for participants’ responses based on the display conditions, labeled as “task Arab” versus “task Jewish.”

Code
# Mean rating per display, used below for the dashed reference lines.
display_means <- data_participants |>
  group_by(display) |>
  summarise(mean_response = mean(Response, na.rm = TRUE))

# Density of ratings by display, with a rug of the raw responses and a dashed
# vertical line at each display's mean.
# CONSISTENCY FIX: the original mixed %>% and |> within one chain; the means
# are now precomputed and only |> is used.
density_plot2 <- ggplot(data_participants, aes(x = Response, fill = display)) +
  geom_density(alpha = 0.5) + # Plot density
  geom_rug(aes(color = display), sides = "b") + # Add rug plot at the bottom
  scale_fill_brewer(palette = "Pastel1") + # Use Pastel1 palette for fill
  scale_color_brewer(palette = "Pastel1") + # Use Pastel1 palette for rug and mean line colors
  theme_minimal() +
  labs(title = "Density of Ratings by Display", x = "Rating", y = "Density") +
  # NOTE(review): `size` for line geoms is deprecated in ggplot2 >= 3.4 in
  # favor of `linewidth`; switch once the installed version is confirmed.
  geom_vline(data = display_means,
             aes(xintercept = mean_response),
             linetype = "dashed", color = "black", size = 0.5)
print(density_plot2)

Subjects responses by display
Code
# Save the density plot with a white background (theme_minimal is transparent).
ggsave("density_plot_with_all.png", density_plot2, path = "../Plots/", width = 10, height = 8, units = "in", bg = "white")

Detecting outliers

Identify and exclude outliers from our data set using The MAD-median rule for outlier removal as recommended by Bakker and Wicherts (2014).

  • b is a scaling factor for the MAD to make it consistent for the normal distribution (the default is 1.4826).
  • threshold is the cutoff value above which values are considered outliers (default is 3). Second option to obtain the MAD-median rule for detecting outliers (better in my opinion): link.
Code
# Threshold determination ----
# MAD-median rule (Bakker & Wicherts, 2014): flag responses farther than
# `threshold` MAD units from the median.
threshold <- 2.24
# Median and MAD of the Response column.
# NOTE(review): constant = 1 gives the raw (unscaled) MAD, while the 2.24
# cutoff in Bakker & Wicherts assumes the consistency-scaled MAD
# (constant = 1.4826, R's default) — confirm which scaling was intended
# before the main study.
median_response <- median(data_participants$Response, na.rm = TRUE)
mad_response <- mad(data_participants$Response, constant = 1, na.rm = TRUE)

lower_bound <- median_response - threshold * mad_response
upper_bound <- median_response + threshold * mad_response

# Row indices of trials outside the MAD-median bounds.
outlier_indices <- which(data_participants$Response < lower_bound | data_participants$Response > upper_bound)

# 1 = outlier trial, 0 = retained trial.
data_participants$is_outlier <- ifelse(data_participants$Response < lower_bound | data_participants$Response > upper_bound, 1, 0)

outliers <- data_participants[outlier_indices, ]
# BUG FIX: the original removed every row whose Response VALUE matched any
# outlier's value (`!Response %in% outliers$Response`), discarding perfectly
# valid trials that merely shared a rating with an outlier. Drop the flagged
# rows themselves instead (guarding against the zero-outlier case, where
# negative indexing with an empty vector would be wrong).
data_cleaned <- if (length(outlier_indices) > 0) {
  data_participants[-outlier_indices, ]
} else {
  data_participants
}



  # Seed retained for reproducibility of the commented-out random-subsampling
  # exploration below; nothing currently executed here uses the RNG.
  set.seed(14)

# test <- data_participants |>
#   #mutate(is_outlier = sample(x = c(0, 1), size = nrow(data_participants), replace = T, prob = c(.8, .2))) |>
#   filter(is_outlier == 1) |>
#   group_by(Participant_Private_ID) |>
#   mutate(n_trials = n()) |>
#   mutate(bad_trials = 85*2 - n_trials) |>
#   mutate(percent_bad_trials = n_bad_trials / (85*2)) 
#   #filter(percent_bad_trials <= 0.2)
# library(dplyr)  

# Per participant: how many of their trials were flagged as outliers and what
# share of their 170 trials (85 images x 2 displays) that represents.
# NOTE(review): mutate() keeps one row per flagged trial (the same bad_trials
# value repeated within a participant) and leaves the result grouped — fine
# for the n_distinct() below, but use summarise()/ungroup() if one row per
# participant is needed later.
test1 <- data_participants |>
    filter(is_outlier == 1) |>
    group_by(Participant_Private_ID) |>
    mutate(bad_trials = n(), 
           #bad_trials = 85 * 2 - n_trials, 
           percent_bad_trials = bad_trials / (85 * 2))

# Number of participants with at least one outlier trial.
num_participants_out <- n_distinct(test1$Participant_Private_ID)

SD table

A table showing the standard deviation of each subject’s ratings across both display types

Code
# Per-participant spread of ratings, pooled over both display types.
participant_sd_ratings <- summarise(
  group_by(data_participants, Participant_Private_ID),
  SD_of_Ratings = sd(Response, na.rm = TRUE)
) |>
  ungroup()

# Render the per-participant SDs as a report table.
kable(participant_sd_ratings, caption = "Standard Deviation of Ratings for Each Participant")
Standard Deviation of Ratings for Each Participant
Participant_Private_ID SD_of_Ratings
10514858 19.72960
10515072 41.73836
10515173 23.39740
10515193 31.32064
10515201 36.83332
10515243 40.58422
10515319 35.68966
10515327 32.17920
10515904 47.55128
10515918 22.24714
10515942 10.57305
10515960 19.62875
10515998 35.48673
10516258 24.88498
10516270 25.19615
10516302 29.10667
10516415 32.04238
10516645 22.79964
10516756 20.96220
10517522 29.27106
10517752 37.97194
10517925 40.37224
10518377 24.35308
10519057 37.29496
10519217 11.59149
10519319 29.61986
10519805 32.12981
10520066 21.90053
10520207 32.73827
10520244 25.67647
10520416 28.98994
10520443 26.83289
10520649 31.50339
10520738 28.81733
10522475 29.30831
10522511 18.49479
10522561 27.52098
10527960 31.49570
10528079 22.82760
10528402 12.17067
10528576 30.03514
10529232 36.92276
10530076 37.41983
10530576 20.11152
10530858 29.53245
10531834 23.79220

examining order effect:

Examining how the order of conditions affects ratings of images as “Arab” or “Jewish,” to ensure there is no influence of presentation sequence on perceptions. Visualization of order effect

Code
# Visualization of order effects
# Boxplots of ratings by condition order, faceted by display, with a red bar
# marking the mean of each box.
# CONSISTENCY FIX: `..y..` is the deprecated spelling of after_stat(y), which
# this file already uses elsewhere.
order_effect_plot <- ggplot(data_participants, aes(x = order_of_conditions, y = Response, fill = display)) +
  geom_boxplot() +
  stat_summary(fun = mean, geom = "errorbar",
               aes(ymax = after_stat(y), ymin = after_stat(y)),
               width = 0.75, color = "red") +
  facet_wrap(~display, scales = "free") +
  labs(title = "Order Effect on Ratings",
       x = "Order of Conditions",
       y = "Rating") +
  theme_minimal() +
  theme(plot.background = element_rect(fill = "white"), # Set plot background to white
        panel.background = element_rect(fill = "white"), # Ensure panel background is white
        text = element_text(color = "black")) + # Ensure text is black
  scale_fill_brewer(palette = "Pastel1")

order_effect_plot

Code
# Save the order-effect plot at 4000x4000 pixels.
ggsave("order_effect_with_all.png", order_effect_plot, path = "../Plots/", width = 4000, height = 4000, units = "px")

Perform a t-test to see if there’s a significant difference in ratings between orders

Code
# Mean rating per condition order and display, spread to one column per display.
mean_ratings_by_order <- data_participants |>
  group_by(order_of_conditions, display) |>
  summarise(mean_rating = mean(Response, na.rm = TRUE), .groups = 'drop') |>
  pivot_wider(names_from = display, values_from = mean_rating)

# Two-sided t-test of Response between condition orders within one display;
# factored out so both displays are tested identically.
run_order_t_test <- function(which_display) {
  t.test(Response ~ order_of_conditions,
         data = dplyr::filter(data_participants, display == which_display),
         alternative = "two.sided")
}

t_test_result_jewish <- run_order_t_test("task_Jewish")

t_test_result_arab <- run_order_t_test("task_Arab")

T-test result

Code
# Collect the t statistics and p-values for both displays into one table.
t_test_results <- data.frame(
  Display = c("Jewish", "Arab"),
  Statistic = c(t_test_result_jewish$statistic, t_test_result_arab$statistic),
  P_Value = c(t_test_result_jewish$p.value, t_test_result_arab$p.value)  # two-sided p-values, one per display
)

# Create a table from the results
kable(t_test_results, caption = "T-Test Results for Jewish and Arab Displays", format = "markdown")
T-Test Results for Jewish and Arab Displays
Display Statistic P_Value
Jewish 1.9062385 0.0566926
Arab -0.7360867 0.4617223
Code
# Also write the markdown table to disk for reuse outside this report.
markdown_table <- kable(t_test_results, caption = "T-Test Results for Jewish and Arab Displays", format = "markdown")
writeLines(markdown_table, "t_test_results.md")

Main task

Difference of means per image by display with a cutoff of 10 points difference between the ratings

  • Analyze the main task by calculating and comparing mean responses for different image categories.
  • Identify significant differences in means and categorize images based on these differences.
Code
# Per-image difference between mean "Jewish" and mean "Arab" ratings, with a
# 10-point cutoff for classifying images as Jewish / Arab / Ambiguous.
data_images_10 <- data_participants |>
  group_by(image, Participant_Private_ID) |>
  #dplyr::filter(!(Participant_Private_ID %in% exclude_participant_IDs))|>
  dplyr::summarize(
    task_Jewish = mean(Response[display == "task_Jewish"], na.rm = TRUE),
    task_Arab = mean(Response[display == "task_Arab"], na.rm = TRUE),
    .groups = 'drop'  ) |> # Calculate the difference in ratings for each participant and image
  mutate(diff_per_participant = task_Jewish - task_Arab) |>
  # Aggregate at the image level
  group_by(image) |>
  dplyr::summarize(
    avg_diff = mean(diff_per_participant, na.rm = TRUE),
    .groups = 'drop') |>
  # Classify on the SIGNED average difference (negative -> Arab-leaning).
  mutate(
    rated_ethnicity = case_when(
      avg_diff < -10 ~ "Arab",
      avg_diff > 10 ~ "Jewish",
      TRUE ~ "Ambiguous"
    )
  ) |>
  mutate(avg_diff = abs(avg_diff)) |>
  arrange(avg_diff)

# BUG FIX: the original re-derived rated_ethnicity from avg_diff AFTER abs()
# had been applied above, so `avg_diff < 0` was never TRUE and every retained
# image was relabelled "Jewish" (or NA at exactly 0). The labels computed
# above are already correct — just split on the cutoff (abs() is redundant
# here since avg_diff is already non-negative).
data_images_big_diff_10 <- dplyr::filter(data_images_10, avg_diff >= 10)


data_images_choosen_10 <- dplyr::filter(data_images_10, avg_diff < 10)
kable(data_images_10, caption = "Difference of means with a cutoff of 10 points")
Difference of means with a cutoff of 10 points
image avg_diff rated_ethnicity
CFD_M-212-N.png 0.0000000 Ambiguous
CFD_M-242-N.png 0.0217391 Ambiguous
CFD_M-211-N.png 0.0434783 Ambiguous
CFD_M-234-N.png 0.7173913 Ambiguous
IFD_M-086-N.png 0.8913043 Ambiguous
CFD_M-227-N.png 1.3260870 Ambiguous
IFD_M-018-N.png 1.3260870 Ambiguous
CFD_M-214-N.png 2.1304348 Ambiguous
CFD_M-236-N.png 2.2391304 Ambiguous
IFD_M-105-N.png 2.5869565 Ambiguous
CFD_M-206-N.png 3.6304348 Ambiguous
CFD_M-218-N.png 4.2391304 Ambiguous
CFD_M-220-N.png 4.3695652 Ambiguous
CFD_M-253-N.png 4.7391304 Ambiguous
CFD_M-248-N.png 5.1956522 Ambiguous
IFD_M-421-N.png 5.2826087 Ambiguous
IFD_M-132-N.png 5.6086957 Ambiguous
IFD_M-419-N.png 6.2608696 Ambiguous
CFD_M-237-N.png 6.4130435 Ambiguous
IFD_M-135-N.png 8.8043478 Ambiguous
IFD_M-108-N.png 9.0000000 Ambiguous
CFD_M-216-N.png 9.3695652 Ambiguous
IFD_M-036-N.png 9.4565217 Ambiguous
IFD_M-416-N.png 9.9130435 Ambiguous
CFD_M-224-N.png 9.9347826 Ambiguous
CFD_M-243-N.png 10.0217391 Arab
CFD_M-247-N.png 10.7608696 Arab
IFD_M-117-N.png 10.8043478 Jewish
IFD_M-122-N.png 11.0217391 Jewish
CFD_M-229-N.png 11.1086957 Arab
IFD_M-067-N.png 11.1521739 Arab
CFD_M-231-N.png 11.3478261 Jewish
IFD_M-100-N.png 11.3913043 Jewish
CFD_M-225-N.png 11.5217391 Arab
IFD_M-121-N.png 11.9347826 Jewish
CFD_M-204-N.png 13.4130435 Jewish
CFD_M-222-N.png 14.3043478 Jewish
IFD_M-418-N.png 14.4130435 Jewish
IFD_M-033-N.png 14.5652174 Jewish
IFD_M-424-N.png 15.0000000 Arab
IFD_M-441-N.png 15.4565217 Jewish
CFD_M-251-N.png 15.5652174 Jewish
IFD_M-136-N.png 15.6739130 Arab
IFD_M-021-N.png 15.8478261 Jewish
CFD_M-221-N.png 16.6956522 Jewish
CFD_M-213-N.png 17.2391304 Arab
CFD_M-230-N.png 17.5434783 Arab
CFD_M-200-N.png 17.9782609 Jewish
IFD_M-015-N.png 18.3260870 Jewish
IFD_M-075-N.png 19.5652174 Arab
IFD_M-113-N.png 19.5652174 Jewish
IFD_M-062-N.png 19.5869565 Arab
IFD_M-420-N.png 19.6086957 Arab
CFD_M-246-N.png 19.8478261 Arab
CFD_M-223-N.png 19.9130435 Arab
IFD_M-044-N.png 20.5217391 Arab
IFD_M-087-N.png 21.6304348 Jewish
CFD_M-252-N.png 22.3260870 Arab
IFD_M-051-N.png 23.0217391 Arab
IFD_M-042-N.png 23.1086957 Arab
IFD_M-035-N.png 23.4130435 Jewish
CFD_M-239-N.png 24.5000000 Arab
CFD_M-238-N.png 25.4565217 Arab
CFD_M-210-N.png 26.6739130 Arab
IFD_M-017-N.png 27.1956522 Jewish
IFD_M-111-N.png 28.5000000 Arab
CFD_M-232-N.png 28.6739130 Arab
IFD_M-114-N.png 29.5652174 Jewish
IFD_M-032-N.png 29.6304348 Arab
IFD_M-097-N.png 29.7826087 Arab
IFD_M-084-N.png 29.8478261 Arab
IFD_M-020-N.png 31.1521739 Arab
CFD_M-250-N.png 31.6304348 Jewish
CFD_M-235-N.png 32.0217391 Arab
IFD_M-049-N.png 33.4347826 Arab
IFD_M-028-N.png 36.5217391 Jewish
CFD_M-201-N.png 39.1086957 Jewish
IFD_M-069-N.png 41.6956522 Arab
IFD_M-423-N.png 44.1086957 Arab
IFD_M-107-N.png 49.4130435 Arab
CFD_M-202-N.png 50.2608696 Arab
IFD_M-066-N.png 60.5000000 Arab
IFD_M-039-N.png 61.4565217 Arab
IFD_M-045-N.png 61.7391304 Arab
IFD_M-046-N.png 70.9347826 Arab
Code
# BUG FIX: the three cutoff chunks all saved to the same "final_data.rdx",
# so each overwrote the previous result; save a per-cutoff file instead.
# NOTE(review): this is an absolute, machine-specific path — prefer a
# relative path like "../Data/" used by the imports above.
saveRDS(data_images_10, file = "C:/Users/97252/Documents/GitHub/face_cater/PILOT- 1 face categorization/Data/final_data_cutoff10.rdx")

Summary Table:

Code
# Count images per ethnicity label, one column per label.
# CONSISTENCY FIX: spread() is superseded in tidyr; pivot_wider() is already
# used elsewhere in this file.
summary_table <- data_images_10 |>
  count(rated_ethnicity) |>
  pivot_wider(names_from = rated_ethnicity, values_from = n)

# Print the summary table
print(summary_table)
# A tibble: 1 × 3
  Ambiguous  Arab Jewish
      <int> <int>  <int>
1        25    37     23

Difference of means per image by display with a cutoff of 15 points difference between the ratings

Code
# Per-image difference between mean "Jewish" and mean "Arab" ratings, with a
# 15-point cutoff for classifying images as Jewish / Arab / Ambiguous.
data_images_15 <- data_participants |>
  group_by(image, Participant_Private_ID) |>
  #dplyr::filter(!(Participant_Private_ID %in% exclude_participant_IDs))|>
  dplyr::summarize(
    task_Jewish = mean(Response[display == "task_Jewish"], na.rm = TRUE),
    task_Arab = mean(Response[display == "task_Arab"], na.rm = TRUE),
    .groups = 'drop'  ) |> # Calculate the difference in ratings for each participant and image
  mutate(diff_per_participant = task_Jewish - task_Arab) |>
  # Aggregate at the image level
  group_by(image) |>
  dplyr::summarize(
    avg_diff = mean(diff_per_participant, na.rm = TRUE),
    .groups = 'drop') |>
  # Classify on the SIGNED average difference (negative -> Arab-leaning).
  mutate(
    rated_ethnicity = case_when(
      avg_diff < -15 ~ "Arab",
      avg_diff > 15 ~ "Jewish",
      TRUE ~ "Ambiguous"
    )
  ) |>
  mutate(avg_diff = abs(avg_diff)) |>
  arrange(avg_diff)

# BUG FIX: the original re-derived rated_ethnicity from avg_diff AFTER abs()
# had been applied above, so `avg_diff < 0` was never TRUE and every retained
# image was relabelled "Jewish" (or NA at exactly 0). The labels computed
# above are already correct — just split on the cutoff.
data_images_big_diff_15 <- dplyr::filter(data_images_15, avg_diff >= 15)

data_images_choosen_15 <- dplyr::filter(data_images_15, avg_diff < 15)

kable(data_images_15, caption = "Difference of means with a cutoff of 15 points")
Difference of means with a cutoff of 15 points
image avg_diff rated_ethnicity
CFD_M-212-N.png 0.0000000 Ambiguous
CFD_M-242-N.png 0.0217391 Ambiguous
CFD_M-211-N.png 0.0434783 Ambiguous
CFD_M-234-N.png 0.7173913 Ambiguous
IFD_M-086-N.png 0.8913043 Ambiguous
CFD_M-227-N.png 1.3260870 Ambiguous
IFD_M-018-N.png 1.3260870 Ambiguous
CFD_M-214-N.png 2.1304348 Ambiguous
CFD_M-236-N.png 2.2391304 Ambiguous
IFD_M-105-N.png 2.5869565 Ambiguous
CFD_M-206-N.png 3.6304348 Ambiguous
CFD_M-218-N.png 4.2391304 Ambiguous
CFD_M-220-N.png 4.3695652 Ambiguous
CFD_M-253-N.png 4.7391304 Ambiguous
CFD_M-248-N.png 5.1956522 Ambiguous
IFD_M-421-N.png 5.2826087 Ambiguous
IFD_M-132-N.png 5.6086957 Ambiguous
IFD_M-419-N.png 6.2608696 Ambiguous
CFD_M-237-N.png 6.4130435 Ambiguous
IFD_M-135-N.png 8.8043478 Ambiguous
IFD_M-108-N.png 9.0000000 Ambiguous
CFD_M-216-N.png 9.3695652 Ambiguous
IFD_M-036-N.png 9.4565217 Ambiguous
IFD_M-416-N.png 9.9130435 Ambiguous
CFD_M-224-N.png 9.9347826 Ambiguous
CFD_M-243-N.png 10.0217391 Ambiguous
CFD_M-247-N.png 10.7608696 Ambiguous
IFD_M-117-N.png 10.8043478 Ambiguous
IFD_M-122-N.png 11.0217391 Ambiguous
CFD_M-229-N.png 11.1086957 Ambiguous
IFD_M-067-N.png 11.1521739 Ambiguous
CFD_M-231-N.png 11.3478261 Ambiguous
IFD_M-100-N.png 11.3913043 Ambiguous
CFD_M-225-N.png 11.5217391 Ambiguous
IFD_M-121-N.png 11.9347826 Ambiguous
CFD_M-204-N.png 13.4130435 Ambiguous
CFD_M-222-N.png 14.3043478 Ambiguous
IFD_M-418-N.png 14.4130435 Ambiguous
IFD_M-033-N.png 14.5652174 Ambiguous
IFD_M-424-N.png 15.0000000 Ambiguous
IFD_M-441-N.png 15.4565217 Jewish
CFD_M-251-N.png 15.5652174 Jewish
IFD_M-136-N.png 15.6739130 Arab
IFD_M-021-N.png 15.8478261 Jewish
CFD_M-221-N.png 16.6956522 Jewish
CFD_M-213-N.png 17.2391304 Arab
CFD_M-230-N.png 17.5434783 Arab
CFD_M-200-N.png 17.9782609 Jewish
IFD_M-015-N.png 18.3260870 Jewish
IFD_M-075-N.png 19.5652174 Arab
IFD_M-113-N.png 19.5652174 Jewish
IFD_M-062-N.png 19.5869565 Arab
IFD_M-420-N.png 19.6086957 Arab
CFD_M-246-N.png 19.8478261 Arab
CFD_M-223-N.png 19.9130435 Arab
IFD_M-044-N.png 20.5217391 Arab
IFD_M-087-N.png 21.6304348 Jewish
CFD_M-252-N.png 22.3260870 Arab
IFD_M-051-N.png 23.0217391 Arab
IFD_M-042-N.png 23.1086957 Arab
IFD_M-035-N.png 23.4130435 Jewish
CFD_M-239-N.png 24.5000000 Arab
CFD_M-238-N.png 25.4565217 Arab
CFD_M-210-N.png 26.6739130 Arab
IFD_M-017-N.png 27.1956522 Jewish
IFD_M-111-N.png 28.5000000 Arab
CFD_M-232-N.png 28.6739130 Arab
IFD_M-114-N.png 29.5652174 Jewish
IFD_M-032-N.png 29.6304348 Arab
IFD_M-097-N.png 29.7826087 Arab
IFD_M-084-N.png 29.8478261 Arab
IFD_M-020-N.png 31.1521739 Arab
CFD_M-250-N.png 31.6304348 Jewish
CFD_M-235-N.png 32.0217391 Arab
IFD_M-049-N.png 33.4347826 Arab
IFD_M-028-N.png 36.5217391 Jewish
CFD_M-201-N.png 39.1086957 Jewish
IFD_M-069-N.png 41.6956522 Arab
IFD_M-423-N.png 44.1086957 Arab
IFD_M-107-N.png 49.4130435 Arab
CFD_M-202-N.png 50.2608696 Arab
IFD_M-066-N.png 60.5000000 Arab
IFD_M-039-N.png 61.4565217 Arab
IFD_M-045-N.png 61.7391304 Arab
IFD_M-046-N.png 70.9347826 Arab
Code
# BUG FIX: the three cutoff chunks all saved to the same "final_data.rdx",
# so each overwrote the previous result; save a per-cutoff file instead.
# NOTE(review): this is an absolute, machine-specific path — prefer a
# relative path like "../Data/" used by the imports above.
saveRDS(data_images_15, file = "C:/Users/97252/Documents/GitHub/face_cater/PILOT- 1 face categorization/Data/final_data_cutoff15.rdx")

Summary Table:

Code
# Count images per ethnicity label at the 15-point cutoff.
# CONSISTENCY FIX: spread() is superseded in tidyr; pivot_wider() is already
# used elsewhere in this file.
summary_table_15 <- data_images_15 |>
  count(rated_ethnicity) |>
  pivot_wider(names_from = rated_ethnicity, values_from = n)

# Print the summary table
print(summary_table_15)
# A tibble: 1 × 3
  Ambiguous  Arab Jewish
      <int> <int>  <int>
1        40    31     14

Difference of means per image by display with a cutoff of 20 points difference between the ratings

Code
# Per-image difference between mean "Jewish" and mean "Arab" ratings, with a
# 20-point cutoff for classifying images as Jewish / Arab / Ambiguous.
data_images_20 <- data_participants |>
  group_by(image, Participant_Private_ID) |>
  #dplyr::filter(!(Participant_Private_ID %in% exclude_participant_IDs))|>
  dplyr::summarize(
    task_Jewish = mean(Response[display == "task_Jewish"], na.rm = TRUE),
    task_Arab = mean(Response[display == "task_Arab"], na.rm = TRUE),
    .groups = 'drop'  ) |> # Calculate the difference in ratings for each participant and image
  mutate(diff_per_participant = task_Jewish - task_Arab) |>
  # Aggregate at the image level
  group_by(image) |>
  dplyr::summarize(
    avg_diff = mean(diff_per_participant, na.rm = TRUE),
    .groups = 'drop') |>
  # Classify on the SIGNED average difference (negative -> Arab-leaning).
  mutate(
    rated_ethnicity = case_when(
      avg_diff < -20 ~ "Arab",
      avg_diff > 20 ~ "Jewish",
      TRUE ~ "Ambiguous"
    )
  ) |>
  mutate(avg_diff = abs(avg_diff)) |>
  arrange(avg_diff)

# BUG FIX: the original re-derived rated_ethnicity from avg_diff AFTER abs()
# had been applied above, so `avg_diff < 0` was never TRUE and every retained
# image was relabelled "Jewish" (or NA at exactly 0). The labels computed
# above are already correct — just split on the cutoff.
data_images_big_diff_20 <- dplyr::filter(data_images_20, avg_diff >= 20)

data_images_choosen_20 <- dplyr::filter(data_images_20, avg_diff < 20)

kable(data_images_20, caption = "Difference of means with a cutoff of 20 points")
Difference of means with a cutoff of 20 points
image avg_diff rated_ethnicity
CFD_M-212-N.png 0.0000000 Ambiguous
CFD_M-242-N.png 0.0217391 Ambiguous
CFD_M-211-N.png 0.0434783 Ambiguous
CFD_M-234-N.png 0.7173913 Ambiguous
IFD_M-086-N.png 0.8913043 Ambiguous
CFD_M-227-N.png 1.3260870 Ambiguous
IFD_M-018-N.png 1.3260870 Ambiguous
CFD_M-214-N.png 2.1304348 Ambiguous
CFD_M-236-N.png 2.2391304 Ambiguous
IFD_M-105-N.png 2.5869565 Ambiguous
CFD_M-206-N.png 3.6304348 Ambiguous
CFD_M-218-N.png 4.2391304 Ambiguous
CFD_M-220-N.png 4.3695652 Ambiguous
CFD_M-253-N.png 4.7391304 Ambiguous
CFD_M-248-N.png 5.1956522 Ambiguous
IFD_M-421-N.png 5.2826087 Ambiguous
IFD_M-132-N.png 5.6086957 Ambiguous
IFD_M-419-N.png 6.2608696 Ambiguous
CFD_M-237-N.png 6.4130435 Ambiguous
IFD_M-135-N.png 8.8043478 Ambiguous
IFD_M-108-N.png 9.0000000 Ambiguous
CFD_M-216-N.png 9.3695652 Ambiguous
IFD_M-036-N.png 9.4565217 Ambiguous
IFD_M-416-N.png 9.9130435 Ambiguous
CFD_M-224-N.png 9.9347826 Ambiguous
CFD_M-243-N.png 10.0217391 Ambiguous
CFD_M-247-N.png 10.7608696 Ambiguous
IFD_M-117-N.png 10.8043478 Ambiguous
IFD_M-122-N.png 11.0217391 Ambiguous
CFD_M-229-N.png 11.1086957 Ambiguous
IFD_M-067-N.png 11.1521739 Ambiguous
CFD_M-231-N.png 11.3478261 Ambiguous
IFD_M-100-N.png 11.3913043 Ambiguous
CFD_M-225-N.png 11.5217391 Ambiguous
IFD_M-121-N.png 11.9347826 Ambiguous
CFD_M-204-N.png 13.4130435 Ambiguous
CFD_M-222-N.png 14.3043478 Ambiguous
IFD_M-418-N.png 14.4130435 Ambiguous
IFD_M-033-N.png 14.5652174 Ambiguous
IFD_M-424-N.png 15.0000000 Ambiguous
IFD_M-441-N.png 15.4565217 Ambiguous
CFD_M-251-N.png 15.5652174 Ambiguous
IFD_M-136-N.png 15.6739130 Ambiguous
IFD_M-021-N.png 15.8478261 Ambiguous
CFD_M-221-N.png 16.6956522 Ambiguous
CFD_M-213-N.png 17.2391304 Ambiguous
CFD_M-230-N.png 17.5434783 Ambiguous
CFD_M-200-N.png 17.9782609 Ambiguous
IFD_M-015-N.png 18.3260870 Ambiguous
IFD_M-075-N.png 19.5652174 Ambiguous
IFD_M-113-N.png 19.5652174 Ambiguous
IFD_M-062-N.png 19.5869565 Ambiguous
IFD_M-420-N.png 19.6086957 Ambiguous
CFD_M-246-N.png 19.8478261 Ambiguous
CFD_M-223-N.png 19.9130435 Ambiguous
IFD_M-044-N.png 20.5217391 Arab
IFD_M-087-N.png 21.6304348 Jewish
CFD_M-252-N.png 22.3260870 Arab
IFD_M-051-N.png 23.0217391 Arab
IFD_M-042-N.png 23.1086957 Arab
IFD_M-035-N.png 23.4130435 Jewish
CFD_M-239-N.png 24.5000000 Arab
CFD_M-238-N.png 25.4565217 Arab
CFD_M-210-N.png 26.6739130 Arab
IFD_M-017-N.png 27.1956522 Jewish
IFD_M-111-N.png 28.5000000 Arab
CFD_M-232-N.png 28.6739130 Arab
IFD_M-114-N.png 29.5652174 Jewish
IFD_M-032-N.png 29.6304348 Arab
IFD_M-097-N.png 29.7826087 Arab
IFD_M-084-N.png 29.8478261 Arab
IFD_M-020-N.png 31.1521739 Arab
CFD_M-250-N.png 31.6304348 Jewish
CFD_M-235-N.png 32.0217391 Arab
IFD_M-049-N.png 33.4347826 Arab
IFD_M-028-N.png 36.5217391 Jewish
CFD_M-201-N.png 39.1086957 Jewish
IFD_M-069-N.png 41.6956522 Arab
IFD_M-423-N.png 44.1086957 Arab
IFD_M-107-N.png 49.4130435 Arab
CFD_M-202-N.png 50.2608696 Arab
IFD_M-066-N.png 60.5000000 Arab
IFD_M-039-N.png 61.4565217 Arab
IFD_M-045-N.png 61.7391304 Arab
IFD_M-046-N.png 70.9347826 Arab
Code
# BUG FIX: the three cutoff chunks all saved to the same "final_data.rdx",
# so each overwrote the previous result; save a per-cutoff file instead.
# NOTE(review): this is an absolute, machine-specific path — prefer a
# relative path like "../Data/" used by the imports above.
saveRDS(data_images_20, file = "C:/Users/97252/Documents/GitHub/face_cater/PILOT- 1 face categorization/Data/final_data_cutoff20.rdx")

Summary Table:

Code
# Count images per ethnicity label at the 20-point cutoff.
# CONSISTENCY FIX: spread() is superseded in tidyr; pivot_wider() is already
# used elsewhere in this file.
summary_table_20 <- data_images_20 |>
  count(rated_ethnicity) |>
  pivot_wider(names_from = rated_ethnicity, values_from = n)

# Print the summary table
print(summary_table_20)
# A tibble: 1 × 3
  Ambiguous  Arab Jewish
      <int> <int>  <int>
1        55    23      7

Difference of means per image by display and by order of conditions:

# Adjusting data_images to consider the effect of condition order.
# Unlike the cutoff chunks above, avg_diff here keeps its SIGN (no abs()),
# so the case_when() classification below is consistent with its input.
data_images_conditions <- data_participants |>
  group_by(image, Participant_Private_ID) |>
  #dplyr::filter(!(Participant_Private_ID %in% exclude_participant_IDs))|> # Uncomment if you need to exclude certain participants
  summarize(
    task_Jewish = mean(Response[display == "task_Jewish"], na.rm = TRUE),
    task_Arab = mean(Response[display == "task_Arab"], na.rm = TRUE),
    order_of_conditions = first(order_of_conditions), # Assuming each participant sees each image only once under each condition
    .groups = 'drop'
  ) |>
  # Per-participant, per-image rating difference (Jewish minus Arab).
  mutate(diff_per_participant = task_Jewish - task_Arab) |>
  group_by(image, order_of_conditions) |> # Group also by order_of_conditions to analyze this factor
  summarize(
    avg_diff = mean(diff_per_participant, na.rm = TRUE),
    .groups = 'drop'
  ) |>
  # Classification on the signed difference: below -10 -> Arab, above +10 ->
  # Jewish, otherwise Ambiguous (same 10-point rule as data_images_10).
  mutate(
    rated_ethnicity = case_when(
      avg_diff < -10 ~ "Arab",
      avg_diff > 10 ~ "Jewish",
      TRUE ~ "Ambiguous"
    )
  ) |>
  arrange(order_of_conditions, abs(avg_diff)) # Sort by order of conditions and then by the size of differences

Visualization with ggplot2:

Data set arrangement

  • Import and clean demographic data.
  • Perform necessary transformations and renamings of demographic variables.
  • Generate summary statistics and visualizations for demographic variables such as age, gender, religiosity, education, and socio-economic status.
Code
# Clean the long-format demographics export and reshape it so each participant
# occupies a single row with one column per questionnaire item.
data_demo <- data_demo |>
  colnames_to_underscores() |>
  # Drop questionnaire bookkeeping rows in one filter call.
  dplyr::filter(
    !(Question_Key %in% c("BEGIN QUESTIONNAIRE", "END QUESTIONNAIRE")),
    Event_Index != "END OF FILE"
  ) |>
  dplyr::select(Participant_Private_ID, Question_Key, Response) |>
  pivot_wider(names_from = Question_Key, values_from = Response)
Code
# Recode quantised gender codes to labels, drop raw quantised/free-text
# columns, and coerce the numeric questionnaire fields.
demo_wide_clean <- data_demo |>
  mutate(gender = case_when(`gender-quantised` == "1" ~ "man",
                            `gender-quantised` == "2" ~ "woman")) |>
  # FIX: `gender-quantised` was listed twice in the original select().
  select(-`gender-quantised`, -`gender-text`, -`ethnic-text`, -`religiosity-quantised`, -`scale_of_SES-quantised`, -`age-quantised`) |>
  mutate(Participant_Private_ID = factor(Participant_Private_ID),
         age = as.numeric(age),
         children = as.numeric(children),
         scale_of_SES = as.numeric(scale_of_SES))

Rename columns

Code
# Give the questionnaire columns analysis-friendly names.
demo_wide_clean <- dplyr::rename(
  demo_wide_clean,
  ethnic = `ethnic-1`,
  SES = scale_of_SES,
  comment = `response-7`,
  ethnic2 = `ethnic-4`
)

Separate visualizations and summary statistics for demographic variables

Age

Code
# Age distribution of the sample (50 bins, x-axis labelled every 2 years).
ggplot(demo_wide_clean, aes(x = age)) +
  geom_histogram(bins = 50) +
  scale_x_continuous(breaks = seq(17, 71, 2)) +
  theme_classic()

Summary Table for AGE stat:

Code
# Summary statistics for age.
Age_stats <- get_summary_stats(as.data.frame(as.numeric(demo_wide_clean$age)))

# FIX: removed the install-on-the-fly `requireNamespace`/`install.packages`
# block — kable() is already called earlier in this script (SD table), so
# knitr is necessarily installed; an analysis report should never install
# packages as a side effect.
library(knitr)

# Generate a nice table using kable
kable(Age_stats, caption = "Summary Statistics for Age", format = "markdown")
Summary Statistics for Age
as.numeric(demo_wide_clean$age)
Min. :18.00
1st Qu.:23.00
Median :30.00
Mean :29.49
3rd Qu.:33.00
Max. :72.00
NA’s :2
Code
# Age summary as a tidy two-column data frame (statistic name, value).
Age_stats_df <- data.frame(
  Statistic = c("Mean", "Median", "SD", "Min", "Max"),
  Value = with(demo_wide_clean, c(
    mean(age, na.rm = TRUE),
    median(age, na.rm = TRUE),
    sd(age, na.rm = TRUE),
    min(age, na.rm = TRUE),
    max(age, na.rm = TRUE)
  ))
)

kable(Age_stats_df, caption = "Summary Statistics for Age", format = "markdown")
Summary Statistics for Age
Statistic Value
Mean 29.4898
Median 30.0000
SD 10.2533
Min 18.0000
Max 72.0000

Gender

Code
# Gender counts.
# FIX: geom_histogram(stat = "count") on discrete data triggers a ggplot2
# warning; geom_bar() is the intended geom (and is already used for the
# education plot later in this file). The rendered bars are identical.
ggplot(demo_wide_clean, aes(x = gender)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 200, 10)) +
  theme_classic()

Code
# Share of each gender among participants with a non-missing gender.
# FIX: standardized na.rm = T to TRUE (T is reassignable; TRUE is reserved).
male_per <- sum(demo_wide_clean$gender == "man", na.rm = TRUE) /
            sum(!is.na(demo_wide_clean$gender))

female_per <- sum(demo_wide_clean$gender == "woman", na.rm = TRUE) /
   sum(!is.na(demo_wide_clean$gender))
# NOTE(review): gender was recoded above to only "man"/"woman" (codes 1/2),
# so this comparison with "לא בינארי" ("non-binary") can never match — if
# non-binary responses exist, compute this from the raw gender column instead.
a_baniari_per <- sum(demo_wide_clean$gender == "לא בינארי", na.rm = TRUE) /
  sum(!is.na(demo_wide_clean$gender))

Religiosity

Code
# Religiosity counts (religiosity is stored as character at this point).
# FIX: geom_histogram(stat = "count") triggers a ggplot2 warning for discrete
# data; geom_bar() is the intended geom. The rendered bars are identical.
ggplot(demo_wide_clean, aes(x = religiosity)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 200, 10)) +
  theme_classic()

Summary Table for Religiosity stat:

Code
# Summary statistics for religiosity (stored as character — see the str()
# output below — hence the as.numeric conversion on a copy).
religiosity_stats <- get_summary_stats(as.data.frame(as.numeric(demo_wide_clean$religiosity)))

# FIX: removed a redundant recomputation of Age_stats (byte-identical to the
# one in the Age section) and a second install-on-the-fly knitr block; knitr
# is already attached and in use at this point in the script.
library(knitr)
str(demo_wide_clean)
tibble [51 × 16] (S3: tbl_df/tbl/data.frame)
 $ Participant_Private_ID: Factor w/ 51 levels "10514858","10515072",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ age                   : num [1:51] 19 72 22 19 27 22 19 26 27 29 ...
 $ gender                : chr [1:51] "man" "man" "man" "woman" ...
 $ ethnic                : chr [1:51] "ישראלי/ת" "ישראלי/ת" NA "ישראלי/ת" ...
 $ ethnic2               : chr [1:51] "יהודי/ת" "יהודי/ת" NA "יהודי/ת" ...
 $ ethnic-8              : chr [1:51] "חרד/ית" NA "חרד/ית" NA ...
 $ religiosity           : chr [1:51] "9" "3" "8" "1" ...
 $ education             : chr [1:51] "למדתי לימודים מתקדמים מעבר לתואר ראשון" "השלמתי תואר ראשון באוניברסיטה" "השלמתי בית ספר יסודי" "השלמתי בית ספר תיכון" ...
 $ education-quantised   : chr [1:51] "7" "6" "2" "4" ...
 $ language              : chr [1:51] "כן" "כן" "כן" "כן" ...
 $ language -quantised   : chr [1:51] "1" "1" "1" "1" ...
 $ vision                : chr [1:51] "ראייה מתוקנת (משקפיים/עדשות)" "ראייה מתוקנת (משקפיים/עדשות)" "ראייה מתוקנת (משקפיים/עדשות)" "כן" ...
 $ vision-quantised      : chr [1:51] "2" "2" "2" "1" ...
 $ children              : num [1:51] 1 2 1 0 0 0 0 0 0 3 ...
 $ SES                   : num [1:51] 6 8 5 6 6 7 7 5 6 4 ...
 $ comment               : chr [1:51] NA NA NA NA ...
Code
# Convert religiosity from character to numeric for the stats below.
# (as.character() is a no-op here: the str() output above shows the column is
# already character.)
demo_wide_clean$religiosity <- as.numeric(as.character(demo_wide_clean$religiosity))
# Generate a nice table using kable
kable(religiosity_stats, caption = "Summary Statistics for religiosity", format = "markdown")
Summary Statistics for religiosity
as.numeric(demo_wide_clean$religiosity)
Min. : 1.000
1st Qu.: 3.000
Median : 6.000
Mean : 5.588
3rd Qu.: 8.000
Max. :10.000
Code
# Religiosity summary as a tidy two-column data frame (statistic name, value).
relig_stats_df <- data.frame(
  Statistic = c("Mean", "Median", "SD", "Min", "Max"),
  Value = with(demo_wide_clean, c(
    mean(religiosity, na.rm = TRUE),
    median(religiosity, na.rm = TRUE),
    sd(religiosity, na.rm = TRUE),
    min(religiosity, na.rm = TRUE),
    max(religiosity, na.rm = TRUE)
  ))
)

kable(relig_stats_df, caption = "Summary Statistics for Religiosity", format = "markdown")
Summary Statistics for Religiosity
Statistic Value
Mean 5.588235
Median 6.000000
SD 3.093066
Min 1.000000
Max 10.000000
Code
# One overall descriptive table for all demographic columns (first column,
# Participant_Private_ID, is dropped).
# FIX: standardized T to TRUE.
demo_table <- flextable::summarizor(demo_wide_clean[,-1], overall_label = "overall") |>
  flextable::as_flextable(sep_w = 0, spread_first_col = TRUE)

Education

Code
# Lookup from quantised education codes to readable labels.
education_levels <- c('1' = 'Part of Primary School', '2' = 'Finished Primary School', '3' =  'Part of High School', '4' = 'Finished High School', '5' = 'In Bachelor\'s Degree', '6' = 'Finished Bachelor\'s Degree', '7' = 'Master\'s Degree', '8' = 'Prefer not to answer')

# FIX: education_levels was defined but unused — the labels were re-typed
# inline. Derive levels/labels from the single lookup so they cannot drift.
demo_wide_clean$education <- factor(demo_wide_clean$`education-quantised`,
                                    levels = names(education_levels),
                                    labels = unname(education_levels))

# Education distribution with readable angled axis labels.
ggplot(demo_wide_clean, aes(x = education)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(0, 200, 10)) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

SES

Code
# SES distribution with a count label inside each bar.
# NOTE(review): geom_histogram(stat = "count") ignores binwidth and warns in
# current ggplot2 — geom_bar() would be conventional — but the stat_bin()
# text layer depends on binwidth-1 binning, so the layer pair is left as-is;
# confirm the rendered labels line up with the bars before the main study.
ggplot(drop_na(demo_wide_clean, SES), aes(x = SES)) +
  geom_histogram(stat = "count", binwidth = 1) +
  stat_bin(binwidth = 1, geom = 'text', color = 'white', aes(label = after_stat(count)),
           position = position_stack(vjust = 0.5)) +
  scale_x_continuous(breaks = c(1:10)) +
  scale_y_continuous(breaks = seq(0, 160, 10)) +
  labs(title = "On a scale of 1-10 how would you rate your Social-Economic status?",
       subtitle = "1 = Lowest status, 10 = Highest status",
       y = "Number of participants",
       x = "") +
  theme_classic() +
  theme(plot.title = element_text(family = "serif", hjust = 0.5, size = 16),
        plot.subtitle = element_text(family = "serif", hjust = 0.5, size = 10))

Summary Table for SES stat:

Code
# Summary statistics for SES (already numeric per the str() output above, so
# as.numeric() is a safe no-op).
SES_stats <- get_summary_stats(as.data.frame(as.numeric(demo_wide_clean$SES)))

# Generate a nice table using kable
kable(SES_stats, caption = "Summary Statistics for SES", format = "markdown")
Summary Statistics for SES
as.numeric(demo_wide_clean$SES)
Min. :2.000
1st Qu.:5.000
Median :6.000
Mean :5.824
3rd Qu.:7.000
Max. :8.000
Code
# SES summary as a tidy two-column data frame (statistic name, value).
SES_stats_df <- data.frame(
  Statistic = c("Mean", "Median", "SD", "Min", "Max"),
  Value = with(demo_wide_clean, c(
    mean(SES, na.rm = TRUE),
    median(SES, na.rm = TRUE),
    sd(SES, na.rm = TRUE),
    min(SES, na.rm = TRUE),
    max(SES, na.rm = TRUE)
  ))
)

# Now generate the table with kable
kable(SES_stats_df, caption = "Summary Statistics for SES", format = "markdown")
Summary Statistics for SES
Statistic Value
Mean 5.823529
Median 6.000000
SD 1.260252
Min 2.000000
Max 8.000000

References

Bakker, Marjan, and Jelte M Wicherts. 2014. “Outlier Removal, Sum Scores, and the Inflation of the Type i Error Rate in Independent Samples t Tests: The Power of Alternatives and Recommendations.” Psychological Methods 19 (3): 409.